###############################################################################-
# Created By: Pietryka
# Creation Date:  April 16, 2019
# Updated Date: December 30, 2020
# Purpose: Fit models
# Contact: mpietryka@fsu.edu
###############################################################################-


#  1. LOAD PACKAGES & DATA =======================================

## 1A. LOAD PACKAGES    -----------------------------
library(tidyverse)   # DATA CLEANING FUNCTIONS
library(lme4)        # MULTILEVEL MODELS
library(texreg)      # DISPLAY MODELS


## 1B. LOAD DATA    -----------------------------

# 'dyads_df' OBJECT CREATED IN '1-clean-the-data/SC-1- Dyadic Data.R'
dyads_df <- read_rds("../Data/Derived/dyads_df.rds")

# SHOW THE "source" ATTRIBUTE STORED ON THE DATA (NULL IF NONE IS SET).
# EXPLICIT print() SO THE VALUE IS ALSO DISPLAYED WHEN THIS FILE IS RUN VIA
# source(), WHICH DOES NOT AUTO-PRINT TOP-LEVEL EXPRESSIONS BY DEFAULT
print(attr(dyads_df, "source"))



# 'new_to_full_df' OBJECT CREATED IN
# '1-clean-the-data/SC-4- Measure Similarities.R'

# SOURCE (source_from): INNOVATIVE (new)
# FOCAL (source_to):    FULL TEXT

# READ ONE SIMILARITY FILE, MERGE THE DYAD-LEVEL VARIABLES ONTO IT, AND APPLY
# THE SAMPLE RESTRICTIONS. ONE HELPER IS USED FOR EVERY N-GRAM VARIANT SO THE
# RESTRICTIONS CANNOT DRIFT APART BETWEEN DATA SETS.
#   rds_file: PATH TO A SIMILARITY .rds FILE
#   dyads:    DYAD-LEVEL DATA; NO `by` IS GIVEN, SO left_join() MATCHES ON ALL
#             COLUMNS THE TWO TABLES SHARE (AND REPORTS THEM IN A MESSAGE)
# RETURNS THE MERGED DATA, KEEPING ONLY ROWS WITH date_to > date_from AND
# year_to >= 1776
load_similarity_df <- function(rds_file, dyads) {
  merged <- left_join(read_rds(rds_file), dyads)
  # FOCUS ON RELEVANT YEARS (date_to AFTER date_from) & POST-1776
  filter(merged, date_to > date_from, year_to >= 1776)
}

# FIVE-GRAMS
df_new <- load_similarity_df("../Data/Derived/new_to_full_df.rds", dyads_df)

# THREE-GRAMS
df_new3 <- load_similarity_df("../Data/Derived/new_to_full_df3.rds", dyads_df)

# SEVEN-GRAMS
df_new7 <- load_similarity_df("../Data/Derived/new_to_full_df7.rds", dyads_df)

# FIVE-GRAMS WITHOUT STEMMING OR REMOVING STOP WORDS
df_new_nostop <- load_similarity_df(
  "../Data/Derived/new_to_full_df_nostop.rds",
  dyads_df
)


# 2. FORMULAE  =======================================

# EVERY FORMULA MODELS THE STANDARDIZED 'ratio' AND INCLUDES RANDOM INTERCEPTS
# FOR THE 'to' AND 'from' GROUPING FACTORS.

# BASELINE PREDICTORS
form_baseline <- scale(ratio) ~
  same_state + us_from +
  (1 | to) + (1 | from)

# GEOGRAPHIC PREDICTORS (PLUS THE BASELINE PREDICTORS)
form_geo <- scale(ratio) ~
  share_border +
  scale(distance) + I(scale(distance)^2) +
  distance_rel + I(distance_rel^2) +
  both_south +
  same_state + us_from +
  (1 | to) + (1 | from)

# TIME PREDICTORS
form_time <- scale(ratio) ~
  year_to + year_from +
  scale(time_diff) + I(scale(time_diff)^2) +
  (1 | to) + (1 | from)

# PARTY PREDICTOR; 'party_same' ENTERS AS A FACTOR WITH 0 AS THE FIRST
# (REFERENCE) LEVEL
form_party <- scale(ratio) ~
  factor(party_same, levels = c(0, -1, 1)) +
  (1 | to) + (1 | from)

# ALL PREDICTORS: GEOGRAPHIC, THEN TIME, THEN PARTY
form_all <- scale(ratio) ~
  share_border +
  scale(distance) + I(scale(distance)^2) +
  distance_rel + I(distance_rel^2) +
  both_south +
  same_state + us_from +
  year_to + year_from +
  scale(time_diff) + I(scale(time_diff)^2) +
  factor(party_same, levels = c(0, -1, 1)) +
  (1 | to) + (1 | from)







#  3. FIT MODELS  =======================================

## 3A. five-gram models -----------

# INTERCEPT-ONLY MODEL: NO PREDICTORS, ONLY RANDOM INTERCEPTS FOR 'to' & 'from'
# NOTE(review): 'intercept_only' IS NOT REFERENCED AGAIN IN THIS FILE --
# CONFIRM WHETHER IT IS STILL NEEDED
intercept_only <- lmer(
  formula = scale(ratio) ~ (1 | to) + (1 | from),
  data = df_new
)

# THE FOLLOWING MODELS ARE ALL FIT TO THE SAME DATA ('df_new') AND DIFFER ONLY
# IN THE FORMULA USED

# FULL TEXT - baseline Xs
fit_full_baseline <- lmer(formula = form_baseline, data = df_new)

# FULL TEXT - Geographic Xs
fit_full_geo <- lmer(formula = form_geo, data = df_new)

# FULL TEXT - Time Xs
fit_full_time <- lmer(formula = form_time, data = df_new)

# FULL TEXT - Party Xs
fit_full_party <- lmer(formula = form_party, data = df_new)

# FULL TEXT - All Xs
fit_full_all <- lmer(formula = form_all, data = df_new)

## 3B. jaccard model -----------

# SAME RIGHT-HAND SIDE AS 'form_all', BUT WITH STANDARDIZED 'jaccard' AS THE
# OUTCOME
form_jaccard <- update(form_all, scale(jaccard) ~ .)

fit_jaccard_all5 <- lmer(formula = form_jaccard, data = df_new)

## 3C. three-gram models -----------

fit_full_all3 <- lmer(formula = form_all, data = df_new3)

fit_jaccard_all3 <- lmer(formula = form_jaccard, data = df_new3)

## 3D. seven-gram models -----------

fit_full_all7 <- lmer(formula = form_all, data = df_new7)

fit_jaccard_all7 <- lmer(formula = form_jaccard, data = df_new7)

## 3E. five-gram models without stemming or removing stop words -----------

fit_full_all_nostop <- lmer(formula = form_all, data = df_new_nostop)

fit_jaccard_all_nostop <- lmer(formula = form_jaccard, data = df_new_nostop)

#  4. DISPLAY MODELS =======================================


## 4A. collect models in lists --------------------

# MAIN TEXT: ONE FIVE-GRAM MODEL PER FORMULA, IN THE ORDER
# baseline, geo, time, party, all
model_maintext_list <- list(
  fit_full_baseline,
  fit_full_geo,
  fit_full_time,
  fit_full_party,
  fit_full_all
)

# MODELS FOR 3-, 5-, AND 7-GRAMS (ALL USING 'form_all')
model_ngram_list <- list(
  fit_full_all3,
  fit_full_all,
  fit_full_all7
)

# LABELS "(1)", "(2)", ... -- ONE PER MAIN-TEXT MODEL
mod_names <- sprintf("(%d)", seq_along(model_maintext_list))



# 5. SAVE  =======================================

# SAVE THE MODEL LISTS AND THE INDIVIDUAL ROBUSTNESS MODELS AS .rds FILES.
# THE OUTPUT FILE IS PASSED POSITIONALLY: THE `path` ARGUMENT OF write_rds()
# IS DEPRECATED SINCE readr 1.4.0 (RENAMED TO `file`), WHILE THE SECOND
# POSITIONAL ARGUMENT WORKS WITH BOTH OLDER AND NEWER VERSIONS OF readr.
write_rds(model_maintext_list, "../Data/Derived/model_maintext_list.rds")
write_rds(model_ngram_list, "../Data/Derived/model_ngram_list.rds")
write_rds(fit_jaccard_all5, "../Data/Derived/fit_jaccard_all5.rds")
write_rds(fit_full_all_nostop, "../Data/Derived/fit_full_all_nostop.rds")


# DISPLAY VERSION NUMBERS FOR R & PACKAGES IN USE
# EXPLICIT print() SO THE INFORMATION IS ALSO SHOWN WHEN THIS FILE IS RUN VIA
# source(), WHICH DOES NOT AUTO-PRINT TOP-LEVEL EXPRESSIONS BY DEFAULT
print(sessionInfo())

